// LEC April 1, 2021

// Component for data_prep.do
// Ethnic var procedure for Centennia data

////////////////////////////////////////////////////////////////////////////////// 
// Creates ethnicvars file which holds ethnic properties of sov. states including dep.
//////////////////////////////////////////////////////////////////////////////////
// Segment-size variable used for every share and index computed below.
// The commented-out line is the alternative measure.
global SIZE "ethnicpopulationcorr"
// global SIZE "ethnicareacorr"


// $INDIR and $SEGMENTFILE must be defined before this file runs; they are not set here.
// Both are quoted so that directories or file names containing spaces still work.
cd "$INDIR"
insheet using "$SEGMENTFILE", clear

// drop if mindistancecapital == .
// NOTE(review): this filter is hard-coded to ethnicpopulationcorr and does not follow $SIZE;
// confirm that this is intended before switching SIZE to the alternative measure.
drop if ethnicpopulationcorr == .
drop if name == "Kazakhstan"
// drop if name == "Netherlands"

// segcap starts at 0 for every segment; the code that would set it to 1 is
// commented out further down, so it stays 0 unless that code is re-enabled.
gen segcap = 0

/*
replace segcap = 1 if mindistancecapital < 0.1

bys id year: egen nsegcap = sum(segcap)

bys id year: egen mindistcap = min(mindistancecapital)
replace segcap = 1 if nsegcap==0 & mindistcap==mindistancecapital & mindistancecapital != .

drop nsegcap
bys id year: egen nsegcap = sum(segcap)

bys id year: egen maxsegcappop = max(segcap*ethnicpopulationcorr)
replace segcap = 0 if nsegcap>1
replace segcap = 1 if nsegcap>1 & maxsegcappop == ethnicpopulationcorr 
	
replace segcap = 0 if name=="Andorra" & ethnicname=="spanish [org]"

drop nsegcap
bys id year: egen nsegcap = sum(segcap)

replace segcap = 0 if name == "Slovakia"
replace segcap = 1 if name == "Slovakia" & ethnicname == "slovak [org]"

replace segcap = 0 if name == "Azerbaijan"
replace segcap = 1 if name == "Azerbaijan" & ethnicname == "azerbaijani [l4]"

*/

// Compute various ethnic vars for each state (id) and year
// ngroups: number of segments with a non-missing ethnicid
// sumpop:  total of $SIZE over the state's segments
bys id year: egen ngroups = count(ethnicid)
bys id year: egen sumpop = sum($SIZE )

// maxeth: size of the state's largest segment.
// Stored as double because it is compared for exact equality with $SIZE just below:
// egen's default storage type (float unless -set type- was changed) would round a
// long/double source value, and the equality could then fail for every segment.
// When $SIZE is itself float, the double copy holds the same value, so nothing changes.
bys id year: egen double maxeth = max($SIZE )
gen maxethflag = 0
replace maxethflag = 1 if $SIZE ==maxeth

// $CAPITAL is not defined in this file; it must expand to a valid -if- expression.
// Where it holds, the largest-segment flag is overridden by segcap.
replace maxethflag = segcap if $CAPITAL

/*
bys id year: egen nmaxsegcap = sum(maxsegcap) 
bys id year: egen nmaxethflag = sum(maxethflag)
*/

// Compute two main ethnic indicators;
// maxshare: share of largest group
gen maxshare = maxeth/sumpop
// KEY VARIABLE: ef: ethnic fractionalization based on population shares
// ef = 1 - sum over the state's segments of (segment share)^2
bys id year: egen sumsq = sum(($SIZE /sumpop)^2)
gen ef = 1 - sumsq  // note there is no default value if sumsq does not execute

// pol = 4 * sum over the state's segments of (1 - s) * s^2, with s the segment share
gen s = $SIZE /sumpop
bys id year: egen sumpol = sum((1-s)*(s^2))
gen pol = 4 * sumpol

// Aggregate Group (AG) properties, computed over all segments that share
// the same ethnicid within a year:
//   nag      - number of segments in the AG
//   sumagpop - total of $SIZE over the AG
//   maxag    - largest single segment of the AG
bysort ethnicid year: egen nag = count(ethnicid)
bysort ethnicid year: egen sumagpop = total($SIZE )
bysort ethnicid year: egen maxag = max($SIZE )
// agtf: territorial fractionalization of the AG,
// i.e. 1 minus the sum of squared segment shares within the AG
bysort ethnicid year: egen sumsqag = total(($SIZE /sumagpop)^2)
generate agtf = 1 - sumsqag

// agpol: 4 * sum over the AG's segments of (1 - sag) * sag^2,
// where sag is the segment's share of the AG total
generate sag = $SIZE /sumagpop
bysort ethnicid year: egen sumpolag = total((1-sag)*(sag^2))
generate agpol = 4 * sumpolag

// State-level AG properties for the segment flagged by maxethflag.
// Each line multiplies a segment-level value by the 0/1 flag and takes the
// maximum within id-year, which spreads the flagged segment's value to every
// row of that state-year.
// NOTE(review): if more than one segment in a state-year carries maxethflag == 1
// (e.g. a tie for largest), each maximum is taken separately and the results may
// come from different AGs - confirm that ties cannot occur in the input.

// agn: number of segments in that AG
bysort id year: egen agn = max(maxethflag*nag)
// agpop: total size of that AG
bysort id year: egen agpop = max(maxethflag*sumagpop)
// agmax: within-state share (maxshare) carried by the flagged segment
bysort id year: egen agmax = max(maxethflag*maxshare)
// KEY VARIABLE: agfrac - territorial fractionalization (agtf) of that AG
bysort id year: egen agfrac = max(maxethflag*agtf)
// maxethid: ethnicid of the flagged segment
bysort id year: egen maxethid = max(maxethflag*ethnicid)
// maxagshare: the state's largest segment size relative to the AG total
bysort id year: egen maxagshare = max(maxethflag*maxeth/sumagpop)
// maxethname: group name, filled only on rows whose ethnicid equals maxethid
generate maxethname = ethnicname if maxethid==ethnicid

// Save the segment-level data (all variables built above) for the AG based routine.
// $OUTDIR must be defined before this file runs; it is quoted so that paths
// containing spaces work. Both output files are written to this directory.
cd "$OUTDIR"

save agdata, replace

// Order segments within each state-year from largest to smallest on the active
// size measure ($SIZE, rather than a hard-coded variable name) before collapsing.
gsort id year -$SIZE

// Now sum up all segment data for each state: one row per id-year.
// (firstnm) keeps the first non-missing name / maxethname in each id-year.
// Every other variable is collapsed with (max); pol and agpol previously
// inherited (max) implicitly from the preceding statistic, which is now explicit.
// NOTE(review): agpol varies by AG, so (max) returns the largest agpol among ALL
// groups present in the state, not the value for the largest group's AG (agfrac,
// by contrast, is restricted through maxethflag) - confirm this is intended.
collapse (firstnm) name maxethname ///
 (max) ngroups sumpop maxshare maxeth ef agn agpop maxagshare agfrac maxethid pol agpol, ///
 by(id year)
sort id year

save ethnicvars, replace

